Introduction

data_test <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-evaluation-data.csv", header = TRUE) %>%select(-INDEX)
data_train <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-training-data.csv", header = TRUE) %>% select(-INDEX)

Data Exploration

Data exploration

dim(data_train)
[1] 2276   16
dim(data_test)
[1] 259  15
# list types for each attribute
sapply(data_train, class)
     TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
       "integer"        "integer"        "integer"        "integer" 
 TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB 
       "integer"        "integer"        "integer"        "integer" 
 TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H TEAM_PITCHING_HR 
       "integer"        "integer"        "integer"        "integer" 
TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
       "integer"        "integer"        "integer"        "integer" 
skim(data_train)
Data summary
Name data_train
Number of rows 2276
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TARGET_WINS 0 1.00 80.79 15.75 0 71.0 82.0 92.00 146 ▁▁▇▅▁
TEAM_BATTING_H 0 1.00 1469.27 144.59 891 1383.0 1454.0 1537.25 2554 ▁▇▂▁▁
TEAM_BATTING_2B 0 1.00 241.25 46.80 69 208.0 238.0 273.00 458 ▁▆▇▂▁
TEAM_BATTING_3B 0 1.00 55.25 27.94 0 34.0 47.0 72.00 223 ▇▇▂▁▁
TEAM_BATTING_HR 0 1.00 99.61 60.55 0 42.0 102.0 147.00 264 ▇▆▇▅▁
TEAM_BATTING_BB 0 1.00 501.56 122.67 0 451.0 512.0 580.00 878 ▁▁▇▇▁
TEAM_BATTING_SO 102 0.96 735.61 248.53 0 548.0 750.0 930.00 1399 ▁▆▇▇▁
TEAM_BASERUN_SB 131 0.94 124.76 87.79 0 66.0 101.0 156.00 697 ▇▃▁▁▁
TEAM_BASERUN_CS 772 0.66 52.80 22.96 0 38.0 49.0 62.00 201 ▃▇▁▁▁
TEAM_BATTING_HBP 2085 0.08 59.36 12.97 29 50.5 58.0 67.00 95 ▂▇▇▅▁
TEAM_PITCHING_H 0 1.00 1779.21 1406.84 1137 1419.0 1518.0 1682.50 30132 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 105.70 61.30 0 50.0 107.0 150.00 343 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 553.01 166.36 0 476.0 536.5 611.00 3645 ▇▁▁▁▁
TEAM_PITCHING_SO 102 0.96 817.73 553.09 0 615.0 813.5 968.00 19278 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 246.48 227.77 65 127.0 159.0 249.25 1898 ▇▁▁▁▁
TEAM_FIELDING_DP 286 0.87 146.39 26.23 52 131.0 149.0 164.00 228 ▁▂▇▆▁
skim(data_test)
Data summary
Name data_test
Number of rows 259
Number of columns 15
_______________________
Column type frequency:
numeric 15
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TEAM_BATTING_H 0 1.00 1469.39 150.66 819 1387.0 1455.0 1548.00 2170 ▁▂▇▁▁
TEAM_BATTING_2B 0 1.00 241.32 49.52 44 210.0 239.0 278.50 376 ▁▂▇▇▂
TEAM_BATTING_3B 0 1.00 55.91 27.14 14 35.0 52.0 72.00 155 ▇▇▃▁▁
TEAM_BATTING_HR 0 1.00 95.63 56.33 0 44.5 101.0 135.50 242 ▆▅▇▃▁
TEAM_BATTING_BB 0 1.00 498.96 120.59 15 436.5 509.0 565.50 792 ▁▁▅▇▁
TEAM_BATTING_SO 18 0.93 709.34 243.11 0 545.0 686.0 912.00 1268 ▁▃▇▇▂
TEAM_BASERUN_SB 13 0.95 123.70 93.39 0 59.0 92.0 151.75 580 ▇▃▁▁▁
TEAM_BASERUN_CS 87 0.66 52.32 23.10 0 38.0 49.5 63.00 154 ▂▇▃▁▁
TEAM_BATTING_HBP 240 0.07 62.37 12.71 42 53.5 62.0 67.50 96 ▃▇▅▁▁
TEAM_PITCHING_H 0 1.00 1813.46 1662.91 1155 1426.5 1515.0 1681.00 22768 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 102.15 57.65 0 52.0 104.0 142.50 336 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 552.42 172.95 136 471.0 526.0 606.50 2008 ▆▇▁▁▁
TEAM_PITCHING_SO 18 0.93 799.67 634.31 0 613.0 745.0 938.00 9963 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 249.75 230.90 73 131.0 163.0 252.00 1568 ▇▁▁▁▁
TEAM_FIELDING_DP 31 0.88 146.06 25.88 69 131.0 148.0 164.00 204 ▁▂▇▇▂

Explore missing variables

data_train %>% 
  gather(variable, value) %>%
  filter(is.na(value)) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(data_train) * 100) %>%
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
  rename(`Variable Missing Data` = variable,
         `Number of Records` = n,
         `Share of Total` = percent) %>%
  kable() %>%
  kable_styling()
Variable Missing Data Number of Records Share of Total
TEAM_BATTING_HBP 2085 92%
TEAM_BASERUN_CS 772 34%
TEAM_FIELDING_DP 286 13%
TEAM_BASERUN_SB 131 5.8%
TEAM_BATTING_SO 102 4.5%
TEAM_PITCHING_SO 102 4.5%
data_test %>% 
  gather(variable, value) %>%
  filter(is.na(value)) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(data_train) * 100) %>%
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
  rename(`Variable Missing Data` = variable,
         `Number of Records` = n,
         `Share of Total` = percent) %>%
  kable() %>%
  kable_styling()
Variable Missing Data Number of Records Share of Total
TEAM_BATTING_HBP 240 11%
TEAM_BASERUN_CS 87 3.8%
TEAM_FIELDING_DP 31 1.4%
TEAM_BATTING_SO 18 0.8%
TEAM_PITCHING_SO 18 0.8%
TEAM_BASERUN_SB 13 0.6%

Data Visualization

data_train %>% 
  gather() %>% 
  ggplot( aes(x= value)) + 
  geom_density(fill='pink') + 
  facet_wrap(~key, scales = 'free')

Let’s take a closer look at the TEAM_BASERUN_SB

data_train %>% 
  ggplot(aes(TEAM_BASERUN_SB)) + 
  geom_histogram(bins = 50, fill = 'pink') +
  geom_vline(aes(xintercept = mean(TEAM_BASERUN_SB, na.rm = T)), col = "red", lty = 2) +
  geom_vline(aes(xintercept = median(TEAM_BASERUN_SB, na.rm = T)), col = "blue", lty = 2) +
  labs(x = element_blank(),
       y = "Count",
       title = "Distribution of Stolen Bases",
       caption = "* Red line is the mean value and blue is the median") + 
  theme_classic()

ggplot(data_train, aes(x=TARGET_WINS)) + 
    geom_histogram(aes(y=..density..),binwidth = 3, colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic()

Correlations with Response Variable Let’s take a look at how the predictors are correlated with the response variable:

q <- cor(data_train)
ggcorrplot(q, type = "upper", outline.color = "white",
           ggtheme = theme_classic,
           colors = c("#6D9EC1", "white", "#E46726"),
           lab = TRUE, show.legend = FALSE, tl.cex = 8, lab_size = 3) 

MI: - TEAM_PITCHING_HR (homeruns pitched) is slightly correlated win the number of wins. This is clearly unexpected because pitched homeruns are a negative for the pitching team. - TEAM_PITCHING_SO (strikeouts by pitcher) likely suffers from outliers because we’d expect strikeouts to be a positive thing for the pitching team. There is a large influential point. - TEAM_PITCHING_BB (walks allowed by pitcher) might suffer from outliers. We’d expected a negative correlation since a walk allowed is a negative for the pitching team.

data_train %>%
  gather(variable, value, -TARGET_WINS) %>%
  ggplot(., aes(value, TARGET_WINS)) + 
  geom_point(fill = "pink", color="pink") + 
  geom_smooth(method = "lm", se = FALSE, color = "red") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = "Wins")


Data Preparation

Imputation using KNN

sum(is.na(data_train))/prod(dim(data_train))
[1] 0.09550747
data_train <- data_train %>%
  mutate(TEAM_BATTING_SO = ifelse(TEAM_BATTING_SO == 0, NA, TEAM_BATTING_SO)) %>%
  mutate(TEAM_PITCHING_SO = ifelse(TEAM_PITCHING_SO > 5346, NA, TEAM_PITCHING_SO)) %>%
  select(-TEAM_BATTING_HBP)

set.seed(42)
knn <- data_train %>% knnImputation()
impute_me <- is.na(data_train$TEAM_BATTING_SO)
data_train[impute_me,"TEAM_BATTING_SO"] <- knn[impute_me,"TEAM_BATTING_SO"] 
impute_me <- is.na(data_train$TEAM_BASERUN_SB)
data_train[impute_me,"TEAM_BASERUN_SB"] <- knn[impute_me,"TEAM_BASERUN_SB"] 
impute_me <- is.na(data_train$TEAM_BASERUN_CS)
data_train[impute_me,"TEAM_BASERUN_CS"] <- knn[impute_me,"TEAM_BASERUN_CS"] 
impute_me <- is.na(data_train$TEAM_PITCHING_SO)
data_train[impute_me,"TEAM_PITCHING_SO"] <- knn[impute_me,"TEAM_PITCHING_SO"]
impute_me <- is.na(data_train$TEAM_FIELDING_DP)
data_train[impute_me,"TEAM_FIELDING_DP"] <- knn[impute_me,"TEAM_FIELDING_DP"]
sum(is.na(data_train))/prod(dim(data_train))
[1] 0

Do the same for the data_test

sum(is.na(data_test))/prod(dim(data_test))
[1] 0.1047619
data_test <- data_test %>%
  mutate(TEAM_BATTING_SO = ifelse(TEAM_BATTING_SO == 0, NA, TEAM_BATTING_SO)) %>%
  mutate(TEAM_PITCHING_SO = ifelse(TEAM_PITCHING_SO > 5346, NA, TEAM_PITCHING_SO)) %>%
  select(-TEAM_BATTING_HBP)

set.seed(42)
knn <- data_test %>% knnImputation()
impute_me <- is.na(data_test$TEAM_BATTING_SO)
data_test[impute_me,"TEAM_BATTING_SO"] <- knn[impute_me,"TEAM_BATTING_SO"] 
impute_me <- is.na(data_test$TEAM_BASERUN_SB)
data_test[impute_me,"TEAM_BASERUN_SB"] <- knn[impute_me,"TEAM_BASERUN_SB"] 
impute_me <- is.na(data_test$TEAM_BASERUN_CS)
data_test[impute_me,"TEAM_BASERUN_CS"] <- knn[impute_me,"TEAM_BASERUN_CS"] 
impute_me <- is.na(data_test$TEAM_PITCHING_SO)
data_test[impute_me,"TEAM_PITCHING_SO"] <- knn[impute_me,"TEAM_PITCHING_SO"]
impute_me <- is.na(data_test$TEAM_FIELDING_DP)
data_test[impute_me,"TEAM_FIELDING_DP"] <- knn[impute_me,"TEAM_FIELDING_DP"]
sum(is.na(data_test))/prod(dim(data_test))
[1] 0

Data Transformation

# New variable: TEAM_BATTING_1B
temp_data <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-training-data.csv", header = TRUE) %>% select(-INDEX)
base_data <- temp_data
mod_data <- base_data %>% mutate(TEAM_BATTING_1B = base_data$TEAM_BATTING_H - select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE))
head(mod_data)
data_transformed <- mod_data

#Log transform TEAM_BASERUN_CS
data_transformed$TEAM_BASERUN_CS_tform <-log(data_transformed$TEAM_BASERUN_CS)
baserun_cs <- ggplot(data_transformed, aes(x=TEAM_BASERUN_CS)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_BASERUN_CS")
baserun_cs_tf <- ggplot(data_transformed, aes(x=TEAM_BASERUN_CS_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "Log Transformed")

#Log transform TEAM_BASERUN_SB
data_transformed$TEAM_BASERUN_SB_tform <-log(data_transformed$TEAM_BASERUN_SB)
baserun_sb <- ggplot(data_transformed, aes(x=TEAM_BASERUN_SB)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_BASERUN_SB")
baserun_sb_tf <- ggplot(data_transformed, aes(x=TEAM_BASERUN_SB_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "Log Transformed")

#Log transform TEAM_BATTING_3B
data_transformed$TEAM_BATTING_3B_tform <-log(data_transformed$TEAM_BATTING_3B)
batting_3b <- ggplot(data_transformed, aes(x=TEAM_BATTING_3B)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_BATTING_3B")
batting_3b_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_3B_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "Log Transformed")

#BoxCoxtransform TEAM_BATTING_BB
data_transformed$TEAM_BATTING_BB_tform <- BoxCox(data_transformed$TEAM_BATTING_BB, BoxCoxLambda(data_transformed$TEAM_BATTING_BB))
batting_bb <- ggplot(data_transformed, aes(x=TEAM_BATTING_BB)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_BATTING_BB")
batting_bb_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_BB_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "BoxCox Transformed")

#BoxCoxtransform TEAM_BATTING_H
data_transformed$TEAM_BATTING_H_tform <- BoxCox(data_transformed$TEAM_BATTING_H, BoxCoxLambda(data_transformed$TEAM_BATTING_H))
batting_h <- ggplot(data_transformed, aes(x=TEAM_BATTING_H)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_BATTING_H")
batting_h_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_H_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "BoxCox Transformed")

#BoxCoxtransform TEAM_BATTING_1B
data_transformed$TEAM_BATTING_1B_tform <- BoxCox(data_transformed$TEAM_BATTING_1B, BoxCoxLambda(data_transformed$TEAM_BATTING_1B))
batting_1b <- ggplot(data_transformed, aes(x=TEAM_BATTING_1B)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_BATTING_1B")
batting_1b_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_1B_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "BoxCox Transformed")

#BoxCoxtransform TEAM_FIELDING_E
data_transformed$TEAM_FIELDING_E_tform <- BoxCox(data_transformed$TEAM_FIELDING_E, BoxCoxLambda(data_transformed$TEAM_FIELDING_E))
fielding_e <- ggplot(data_transformed, aes(x=TEAM_FIELDING_E)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_FIELDING_E")
fielding_e_tf <- ggplot(data_transformed, aes(x=TEAM_FIELDING_E_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "BoxCox Transformed")

#Log transform TEAM_PITCHING_BB
data_transformed$TEAM_PITCHING_BB_tform <-log(data_transformed$TEAM_PITCHING_BB)
pitching_bb <- ggplot(data_transformed, aes(x=TEAM_PITCHING_BB)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_PITCHING_BB")
pitching_bb_tf <- ggplot(data_transformed, aes(x=TEAM_PITCHING_BB_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "Log Transformed")

#BoxCoxtransform TEAM_PITCHING_H
data_transformed$TEAM_PITCHING_H_tform <- BoxCox(data_transformed$TEAM_PITCHING_H, BoxCoxLambda(data_transformed$TEAM_PITCHING_H))
pitching_h <- ggplot(data_transformed, aes(x=TEAM_PITCHING_H)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_PITCHING_H")
pitching_h_tf <- ggplot(data_transformed, aes(x=TEAM_PITCHING_H_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "BoxCox Transformed")

#Log transform TEAM_PITCHING_SO
data_transformed$TEAM_PITCHING_SO_tform <-log(data_transformed$TEAM_PITCHING_SO)
pitching_so <- ggplot(data_transformed, aes(x=TEAM_PITCHING_SO)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "TEAM_PITCHING_SO")
pitching_so_tf <- ggplot(data_transformed, aes(x=TEAM_PITCHING_SO_tform)) + 
    geom_histogram(aes(y=..density..), colour="black", fill="red") +
    geom_density(alpha=.8, fill="pink") + 
  theme_classic() + labs(title = "Log Transformed")
plot_grid(baserun_cs, baserun_cs_tf, baserun_sb, baserun_sb_tf,
          batting_3b, batting_3b_tf, batting_bb, batting_bb_tf,
          batting_h, batting_h_tf, batting_1b, batting_1b_tf, 
          fielding_e, fielding_e_tf, pitching_bb, pitching_bb_tf, 
          pitching_h, pitching_h_tf, pitching_so, pitching_so_tf, 
          ncol = 2)

Do the same for the test set

#Test data transformations to match model
temp_test <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-evaluation-data.csv", header = TRUE) %>%select(-INDEX)

mod_data_test <- temp_test %>% mutate(TEAM_BATTING_1B = TEAM_BATTING_H - select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE))

test_data_transformed <- mod_data_test

#Log transform TEAM_BASERUN_CS
test_data_transformed$TEAM_BASERUN_CS_tform <-log(test_data_transformed$TEAM_BASERUN_CS)

#Log transform TEAM_BASERUN_SB
test_data_transformed$TEAM_BASERUN_SB_tform <-log(test_data_transformed$TEAM_BASERUN_SB)

#Log transform TEAM_BATTING_3B
test_data_transformed$TEAM_BATTING_3B_tform <-log(test_data_transformed$TEAM_BATTING_3B)

#BoxCoxtransform TEAM_BATTING_BB
test_data_transformed$TEAM_BATTING_BB_tform <- BoxCox(test_data_transformed$TEAM_BATTING_BB, BoxCoxLambda(test_data_transformed$TEAM_BATTING_BB))

#BoxCoxtransform TEAM_BATTING_H
test_data_transformed$TEAM_BATTING_H_tform <- BoxCox(test_data_transformed$TEAM_BATTING_H, BoxCoxLambda(test_data_transformed$TEAM_BATTING_H))

#BoxCoxtransform TEAM_BATTING_1B
test_data_transformed$TEAM_BATTING_1B_tform <- BoxCox(test_data_transformed$TEAM_BATTING_1B, BoxCoxLambda(test_data_transformed$TEAM_BATTING_1B))

#BoxCoxtransform TEAM_FIELDING_E
test_data_transformed$TEAM_FIELDING_E_tform <- BoxCox(test_data_transformed$TEAM_FIELDING_E, BoxCoxLambda(test_data_transformed$TEAM_FIELDING_E))

#Log transform TEAM_PITCHING_BB
test_data_transformed$TEAM_PITCHING_BB_tform <-log(test_data_transformed$TEAM_PITCHING_BB)

#BoxCoxtransform TEAM_PITCHING_H
test_data_transformed$TEAM_PITCHING_H_tform <- BoxCox(test_data_transformed$TEAM_PITCHING_H, BoxCoxLambda(test_data_transformed$TEAM_PITCHING_H))

#Log transform TEAM_PITCHING_SO
test_data_transformed$TEAM_PITCHING_SO_tform <-log(test_data_transformed$TEAM_PITCHING_SO)

Feature engineering

MI: this passage is just for reference when dealing with leverage point. Can be removed later Outliers & Leverage Points

In summary, an outlier is a point whose standardized residual falls outside the interval from –2 to 2. Recall that a bad leverage point is a leverage point which is also an outlier. Thus, a bad leverage point is a leverage point whose standar- dized residual falls outside the interval from –2 to 2. On the other hand, a good leverage point is a leverage point whose standardized residual falls inside the interval from –2 to 2.

Recall that the rule for simple linear regression for classifying a point as a leverage point is hii > 4/n .

temp_holder <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-training-data.csv", header = TRUE) %>% select(-INDEX)

add_features <- function(df){
  df %>%
    mutate(TEAM_BATTING_1B = TEAM_BATTING_H - TEAM_BATTING_2B - TEAM_BATTING_3B - TEAM_BATTING_HR)
}

data_train_mi <- add_features(temp_holder)
# Creating "at bats" variable representing every time a batter steps up to bat
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_AB = TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP + TEAM_BATTING_SO)
# Creating "batting average" variable
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_AVG = TEAM_BATTING_H/TEAM_BATTING_AB)
# Creating "on base percentage" representing the proportion of ways to get a base out of total opportunities to hit the ball
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_OBP = (TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP)/(TEAM_BATTING_AB + TEAM_BATTING_BB + TEAM_BATTING_HBP))
# Creating "slugging percentage" which is a weighted sum of hits by number of bases acquired divided by opportunities to hit the ball
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_SLG = (TEAM_BATTING_1B + 2*TEAM_BATTING_2B + 3*TEAM_BATTING_3B + 3*TEAM_BATTING_HR)/TEAM_BATTING_AB)

Encapsulating the at above into a function. (Use only one of these in final report). Function makes more sense for re-use in prediction

add_advanced_bb_features <- function(df) {
  df %>% 
    mutate(TEAM_BATTING_AB = TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP + TEAM_BATTING_SO) %>%
    mutate(TEAM_BATTING_AVG = TEAM_BATTING_H/TEAM_BATTING_AB) %>%
    mutate(TEAM_BATTING_OBP = (TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP)/(TEAM_BATTING_AB + TEAM_BATTING_BB + TEAM_BATTING_HBP)) %>%
    mutate(TEAM_BATTING_SLG = (TEAM_BATTING_1B + 2*TEAM_BATTING_2B + 3*TEAM_BATTING_3B + 3*TEAM_BATTING_HR)/TEAM_BATTING_AB)
}
data_train_mi

MI: Too many values are removed. These engineered features rely on TEAM_BATTING_HBP in the calculations but this is the variable with the most missing data. Treatment of this variable will affect downstream results.


Models Building

Model_1.1 (without transformation)

Salma

# simple model
m1 <- lm(TARGET_WINS ~., data = data_train, na.action = na.omit)

summary(m1)

Call:
lm(formula = TARGET_WINS ~ ., data = data_train, na.action = na.omit)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.112  -8.463   0.047   8.364  61.603 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       6.9438573  5.8754934   1.182 0.237396    
TEAM_BATTING_H    0.0521644  0.0037624  13.865  < 2e-16 ***
TEAM_BATTING_2B  -0.0146400  0.0091385  -1.602 0.109290    
TEAM_BATTING_3B   0.0383492  0.0172017   2.229 0.025886 *  
TEAM_BATTING_HR   0.0768270  0.0281262   2.732 0.006354 ** 
TEAM_BATTING_BB   0.0013952  0.0049562   0.281 0.778354    
TEAM_BATTING_SO  -0.0019862  0.0033961  -0.585 0.558720    
TEAM_BASERUN_SB   0.0040365  0.0054119   0.746 0.455838    
TEAM_BASERUN_CS   0.0973314  0.0155751   6.249 4.92e-10 ***
TEAM_PITCHING_H  -0.0004574  0.0003778  -1.211 0.226086    
TEAM_PITCHING_HR  0.0030711  0.0247709   0.124 0.901341    
TEAM_PITCHING_BB  0.0108895  0.0032308   3.371 0.000763 ***
TEAM_PITCHING_SO -0.0023276  0.0022387  -1.040 0.298585    
TEAM_FIELDING_E  -0.0216762  0.0024116  -8.988  < 2e-16 ***
TEAM_FIELDING_DP -0.0961580  0.0139737  -6.881 7.65e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.95 on 2261 degrees of freedom
Multiple R-squared:  0.3284,    Adjusted R-squared:  0.3242 
F-statistic: 78.96 on 14 and 2261 DF,  p-value: < 2.2e-16

plot(m1)


Model_1.2 (Backward elimination)

m2 <- lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HR +TEAM_BATTING_3B +TEAM_BASERUN_CS+TEAM_FIELDING_E+ TEAM_FIELDING_DP, data = data_train)
summary(m2)

Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HR + 
    TEAM_BATTING_3B + TEAM_BASERUN_CS + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
    data = data_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-46.717  -8.663  -0.059   8.538  65.935 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       4.473998   3.915403   1.143 0.253298    
TEAM_BATTING_H    0.051169   0.002467  20.741  < 2e-16 ***
TEAM_BATTING_HR   0.073968   0.007638   9.684  < 2e-16 ***
TEAM_BATTING_3B   0.061358   0.016375   3.747 0.000183 ***
TEAM_BASERUN_CS   0.107941   0.012081   8.935  < 2e-16 ***
TEAM_FIELDING_E  -0.023632   0.001584 -14.924  < 2e-16 ***
TEAM_FIELDING_DP -0.078465   0.013791  -5.690 1.44e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 13.06 on 2269 degrees of freedom
Multiple R-squared:  0.3144,    Adjusted R-squared:  0.3126 
F-statistic: 173.4 on 6 and 2269 DF,  p-value: < 2.2e-16

plot(m2)


Model_1.3 (polynomial regression)

full_formula <- "TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP + I(TEAM_BATTING_2B^2) + I(TEAM_BATTING_3B^2) + I(TEAM_BATTING_HR^2) + I(TEAM_BATTING_BB^2) + I(TEAM_BATTING_SO^2) + I(TEAM_BASERUN_SB^2) + I(TEAM_BASERUN_CS^2) + I(TEAM_PITCHING_H^2) + I(TEAM_PITCHING_HR^2) + I(TEAM_PITCHING_BB^2) + I(TEAM_PITCHING_SO^2) + I(TEAM_FIELDING_E^2) + I(TEAM_FIELDING_DP^2)  + I(TEAM_BATTING_2B^3) + I(TEAM_BATTING_3B^3) + I(TEAM_BATTING_HR^3) + I(TEAM_BATTING_BB^3) + I(TEAM_BATTING_SO^3) + I(TEAM_BASERUN_SB^3) + I(TEAM_BASERUN_CS^3) + I(TEAM_PITCHING_H^3) + I(TEAM_PITCHING_HR^3) + I(TEAM_PITCHING_BB^3) + I(TEAM_PITCHING_SO^3) + I(TEAM_FIELDING_E^3) + I(TEAM_FIELDING_DP^3)  + I(TEAM_BATTING_2B^4) + I(TEAM_BATTING_3B^4) + I(TEAM_BATTING_HR^4) + I(TEAM_BATTING_BB^4) + I(TEAM_BATTING_SO^4) + I(TEAM_BASERUN_SB^4) + I(TEAM_BASERUN_CS^4) + I(TEAM_PITCHING_H^4) + I(TEAM_PITCHING_HR^4) + I(TEAM_PITCHING_BB^4) + I(TEAM_PITCHING_SO^4) + I(TEAM_FIELDING_E^4) + I(TEAM_FIELDING_DP^4) "

m3 <- lm(full_formula, data_train)
step_back <- MASS::stepAIC(m3, direction="backward", trace = F)
poly_call <- summary(step_back)$call
step_back <- lm(poly_call[2], data_train)
summary(step_back)

Call:
lm(formula = poly_call[2], data = data_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-54.736  -7.452   0.017   7.344  61.619 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -1.055e+01  5.511e+01  -0.191 0.848198    
TEAM_BATTING_2B        1.519e+00  1.809e-01   8.396  < 2e-16 ***
TEAM_BATTING_3B        3.764e-01  1.871e-01   2.012 0.044312 *  
TEAM_BATTING_BB        7.492e-01  9.617e-02   7.790 1.02e-14 ***
TEAM_BASERUN_SB        4.464e-02  1.191e-02   3.748 0.000182 ***
TEAM_PITCHING_H        4.189e-02  3.584e-03  11.689  < 2e-16 ***
TEAM_PITCHING_BB      -6.975e-02  1.195e-02  -5.836 6.14e-09 ***
TEAM_PITCHING_SO      -2.545e-02  4.404e-03  -5.779 8.57e-09 ***
TEAM_FIELDING_E       -2.289e-01  2.252e-02 -10.166  < 2e-16 ***
TEAM_FIELDING_DP      -3.639e+00  1.629e+00  -2.233 0.025639 *  
I(TEAM_BATTING_2B^2)  -5.951e-03  7.210e-04  -8.254 2.58e-16 ***
I(TEAM_BATTING_3B^2)  -5.323e-03  3.696e-03  -1.440 0.149948    
I(TEAM_BATTING_HR^2)  -2.369e-03  9.823e-04  -2.412 0.015967 *  
I(TEAM_BATTING_BB^2)  -2.499e-03  3.194e-04  -7.825 7.78e-15 ***
I(TEAM_BATTING_SO^2)   3.270e-05  8.862e-06   3.690 0.000229 ***
I(TEAM_BASERUN_CS^2)  -1.157e-03  8.030e-04  -1.441 0.149837    
I(TEAM_PITCHING_H^2)  -4.327e-06  4.723e-07  -9.161  < 2e-16 ***
I(TEAM_PITCHING_HR^2)  3.015e-03  7.715e-04   3.909 9.56e-05 ***
I(TEAM_PITCHING_SO^2)  2.245e-06  1.159e-06   1.937 0.052925 .  
I(TEAM_FIELDING_E^2)   3.929e-04  5.385e-05   7.296 4.10e-13 ***
I(TEAM_FIELDING_DP^2)  3.887e-02  1.806e-02   2.153 0.031455 *  
I(TEAM_BATTING_2B^3)   7.573e-06  9.411e-07   8.047 1.36e-15 ***
I(TEAM_BATTING_3B^3)   4.442e-05  2.803e-05   1.585 0.113173    
I(TEAM_BATTING_HR^3)   1.575e-05  7.177e-06   2.194 0.028302 *  
I(TEAM_BATTING_BB^3)   3.690e-06  4.879e-07   7.564 5.68e-14 ***
I(TEAM_BATTING_SO^3)  -2.345e-08  5.875e-09  -3.992 6.76e-05 ***
I(TEAM_BASERUN_SB^3)  -2.227e-07  1.443e-07  -1.543 0.122933    
I(TEAM_BASERUN_CS^3)   2.134e-05  9.901e-06   2.156 0.031216 *  
I(TEAM_PITCHING_H^3)   1.688e-10  2.696e-11   6.262 4.54e-10 ***
I(TEAM_PITCHING_HR^3) -1.683e-05  4.989e-06  -3.374 0.000753 ***
I(TEAM_PITCHING_BB^3)  1.061e-08  2.997e-09   3.540 0.000408 ***
I(TEAM_FIELDING_E^3)  -3.019e-07  4.806e-08  -6.281 4.02e-10 ***
I(TEAM_FIELDING_DP^3) -1.854e-04  8.639e-05  -2.146 0.031983 *  
I(TEAM_BATTING_3B^4)  -1.265e-07  6.979e-08  -1.813 0.069995 .  
I(TEAM_BATTING_HR^4)  -2.933e-08  1.495e-08  -1.962 0.049907 *  
I(TEAM_BATTING_BB^4)  -1.874e-09  2.652e-10  -7.064 2.16e-12 ***
I(TEAM_BASERUN_SB^4)   3.276e-10  2.064e-10   1.587 0.112665    
I(TEAM_BASERUN_CS^4)  -6.719e-08  3.306e-08  -2.032 0.042243 *  
I(TEAM_PITCHING_H^4)  -2.130e-15  4.961e-16  -4.295 1.82e-05 ***
I(TEAM_PITCHING_HR^4)  2.649e-08  8.868e-09   2.988 0.002842 ** 
I(TEAM_PITCHING_BB^4) -1.522e-12  6.952e-13  -2.189 0.028685 *  
I(TEAM_FIELDING_E^4)   7.607e-11  1.416e-11   5.371 8.66e-08 ***
I(TEAM_FIELDING_DP^4)  3.253e-07  1.511e-07   2.153 0.031439 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 11.95 on 2233 degrees of freedom
Multiple R-squared:  0.4351,    Adjusted R-squared:  0.4245 
F-statistic: 40.95 on 42 and 2233 DF,  p-value: < 2.2e-16

plot(step_back)


Model_2.1 (using transformation)

Using Transformation - Dhairav

#data with all transformed variables
m4_data <- select(data_transformed,-TEAM_BASERUN_CS, -TEAM_BASERUN_SB, -TEAM_BATTING_3B, -TEAM_BATTING_BB, -TEAM_BATTING_H, -TEAM_FIELDING_E, -TEAM_PITCHING_BB, -TEAM_PITCHING_H, -TEAM_PITCHING_SO, -TEAM_BATTING_HBP, -TEAM_BASERUN_CS, -TEAM_BATTING_1B)
m4 <- lm(TARGET_WINS ~., data = m4_data)
summary(m4)

Call:
lm(formula = TARGET_WINS ~ ., data = m4_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-33.994  -6.794   0.131   6.903  31.095 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -5.713e+04  3.049e+04  -1.873 0.061213 .  
TEAM_BATTING_2B        -5.845e-02  2.991e-02  -1.954 0.050836 .  
TEAM_BATTING_HR         8.265e-02  7.497e-02   1.102 0.270442    
TEAM_BATTING_SO        -3.289e-02  9.775e-03  -3.365 0.000786 ***
TEAM_PITCHING_HR        2.690e-02  6.422e-02   0.419 0.675433    
TEAM_FIELDING_DP       -1.084e-01  1.343e-02  -8.069 1.46e-15 ***
TEAM_BASERUN_CS_tform   9.830e-01  1.067e+00   0.921 0.356943    
TEAM_BASERUN_SB_tform   3.803e+00  8.515e-01   4.466 8.57e-06 ***
TEAM_BATTING_3B_tform   6.310e+00  1.456e+00   4.335 1.56e-05 ***
TEAM_BATTING_BB_tform   5.846e-05  2.713e-05   2.155 0.031329 *  
TEAM_BATTING_H_tform    8.573e+04  7.161e+04   1.197 0.231415    
TEAM_BATTING_1B_tform   1.446e+04  3.134e+04   0.462 0.644473    
TEAM_FIELDING_E_tform  -1.168e+03  9.249e+01 -12.630  < 2e-16 ***
TEAM_PITCHING_BB_tform  4.444e+00  8.335e+00   0.533 0.594005    
TEAM_PITCHING_H_tform  -4.167e+04  2.492e+04  -1.672 0.094777 .  
TEAM_PITCHING_SO_tform  8.444e+00  7.733e+00   1.092 0.275075    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.828 on 1470 degrees of freedom
  (790 observations deleted due to missingness)
Multiple R-squared:  0.4066,    Adjusted R-squared:  0.4005 
F-statistic: 67.14 on 15 and 1470 DF,  p-value: < 2.2e-16

Model_2.2 (Backward elimination)

Using Stepwise backward elimination

m4a <- update(m4, . ~ . -TEAM_PITCHING_HR)
# summary(m4a)
m4b <- update(m4a, . ~ . -TEAM_BATTING_1B_tform)
# summary(m4b)
m4c <- update(m4b, . ~ . -TEAM_PITCHING_BB_tform)
# summary(m4c)
m4d <- update(m4c, . ~ . -TEAM_PITCHING_SO_tform)
# summary(m4d)
m4e <- update(m4d, . ~ . -TEAM_PITCHING_H_tform)
summary(m4e)

Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_HR + 
    TEAM_BATTING_SO + TEAM_FIELDING_DP + TEAM_BASERUN_CS_tform + 
    TEAM_BASERUN_SB_tform + TEAM_BATTING_3B_tform + TEAM_BATTING_BB_tform + 
    TEAM_BATTING_H_tform + TEAM_FIELDING_E_tform, data = m4_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-33.525  -7.082   0.271   6.968  31.845 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -6.994e+04  9.958e+03  -7.024 3.29e-12 ***
TEAM_BATTING_2B       -7.585e-02  9.613e-03  -7.890 5.85e-15 ***
TEAM_BATTING_HR        9.978e-02  9.736e-03  10.249  < 2e-16 ***
TEAM_BATTING_SO       -2.190e-02  2.503e-03  -8.750  < 2e-16 ***
TEAM_FIELDING_DP      -1.084e-01  1.340e-02  -8.086 1.27e-15 ***
TEAM_BASERUN_CS_tform  7.508e-01  1.061e+00   0.707    0.479    
TEAM_BASERUN_SB_tform  4.026e+00  8.372e-01   4.809 1.67e-06 ***
TEAM_BATTING_3B_tform  5.518e+00  9.491e-01   5.814 7.45e-09 ***
TEAM_BATTING_BB_tform  7.186e-05  6.130e-06  11.723  < 2e-16 ***
TEAM_BATTING_H_tform   7.145e+04  9.968e+03   7.168 1.20e-12 ***
TEAM_FIELDING_E_tform -1.190e+03  9.144e+01 -13.015  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.835 on 1475 degrees of freedom
  (790 observations deleted due to missingness)
Multiple R-squared:  0.4037,    Adjusted R-squared:  0.3997 
F-statistic: 99.88 on 10 and 1475 DF,  p-value: < 2.2e-16
plot(m4e)


Model_3.1 (feature engineering)

Mael

data_train_mi %>%
  gather(variable, value, -c(TARGET_WINS:TEAM_BATTING_1B)) %>%
  ggplot(., aes(value, TARGET_WINS)) + 
  geom_point(fill = "pink", color="pink") + 
  geom_smooth(method = "lm", se = FALSE, color = "red") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = "Wins")

mi_m1 <- lm(TARGET_WINS ~ TEAM_BATTING_AB + TEAM_BATTING_AVG + TEAM_BATTING_OBP + TEAM_BATTING_SLG, data = data_train_mi)
summary(mi_m1)

Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_AB + TEAM_BATTING_AVG + 
    TEAM_BATTING_OBP + TEAM_BATTING_SLG, data = data_train_mi)

Residuals:
     Min       1Q   Median       3Q      Max 
-22.7778  -6.4023  -0.3835   5.8471  24.0195 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -2.438e+02  3.023e+01  -8.066 8.70e-14 ***
TEAM_BATTING_AB   2.083e-02  6.001e-03   3.471 0.000644 ***
TEAM_BATTING_AVG -6.611e+02  1.111e+02  -5.950 1.31e-08 ***
TEAM_BATTING_OBP  8.939e+02  1.156e+02   7.733 6.42e-13 ***
TEAM_BATTING_SLG  1.049e+02  3.922e+01   2.674 0.008161 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.105 on 186 degrees of freedom
  (2085 observations deleted due to missingness)
Multiple R-squared:  0.447, Adjusted R-squared:  0.4351 
F-statistic: 37.59 on 4 and 186 DF,  p-value: < 2.2e-16

The standardized residual plots show quite a few points outside the -2,2 range, which might justify removing those observations.

Residuals vs Fitted: while the line is not quite horizontal, the constant variance assumption seems met Normal Q-Q plot: normality assumption is met Root(Squared Residuals) vs Fitted Values: Residuals vs Leverage: a few points have standardized residuals outside the (-2,2) ranhe which might justify removing those observations.

plot(mi_m1)

MI: tyring out predictions on this model

# data_test_mi <- add_features(data_test)
# data_test_mi <- add_advanced_bb_features(data_test_mi)
# pred.w.plim <- predict(mi_m1, data_test_mi, interval = "prediction")
# pred.w.clim <- predict(mi_m1, data_test_mi, interval = "confidence")

Model selection